** Transcriptional Factor analysis in neighborhood Zone (TFinZONE) **

General Information of this Script

                                       ## **Aim of the Script** 

######### Transcriptional Factor analysis in neighborhood Zone (TFinZONE) ###############################################
##### This Script aims to analyze the DNA-Motifs associated with the chromatin States areas as scRNA-seq data. Instead of an expression of genes, we propose an enrichment score. The multiplication of the motif score on the Peak and the log10-Pvalue of the motif in the chromatin state of interest.
## Input files
## 

## **Input files**
   ##### 08MOTIFS_on_PEAKS
      - Folder containing all the output results from homer motifs of all Peaks files used in the program.

   ##### 08TSS_Enh_w_01dpci_fHeatmap_ALL.xlsx
      - Matrix file, Containing the log10Pvalue from each motifs.
    
  ## **Output files**
    #### 03OUTPUT
        - Folder containing all the plots used in figure 3. As well as additional output from the Script

Set working directories and Project name

 workdir = "./"
setwd(workdir)

PTHA="../03OUTPUT/"
dir.create(PTHA)
PROJECT="03a_TFinZONE_at_01dpci"

PORT=paste(PTHA,PROJECT,"/",sep="")
PTHA1=paste(PTHA,PROJECT,"/",sep="")

dir.create(PTHA)
dir.create(PORT)
dir.create(PTHA1)
TRY="_"
TRY1="02Ia_"
PVALUE=0.05
#### FOR VOLCANO ###
upcol<- "#B2182B" # magenta from PiyG
nc<- "#000000" # black
downcol<- "grey" # green from PiyG
CC= c(downcol, "#F7F7F7",upcol)
CO3<- c("lightgrey","#B2182B")
CO_ALU3=c("#d7191c","#d8b365","#542788","grey", "#91bfdb", "grey","grey")

NAME33<- c("Mo_Name","Consensus","P-value","Log_P-value","q-value_Benja","Nor_of_Tar","Per_of_Tar","No_of_Tar_Backg","Per_of_Tar_Backg")
NAME34<- c("HOmer_NAME",NAME33)
mat_m<- data.frame("mask_500_FIND")
## Here you write the Path to the 
WORK1="../01DATA_ORI/08MOTIFS_on_PEAKS/"

#WORK1="Y:/002ZF/000FIGURES_ANALY/00GITHUB_01HIS_Chroma_Factors/01HIS_Chrom_Factors_TEST/04MOtifs_Analy_as_SC_MOAC_Fig3_5/02Fig4_MO_MOAC_01dpci/01DATA_ORI/08MOTIFS_on_PEAKS/"
NAME_G2 <-data.frame(list.files(path=WORK1,pattern="*_FIND"))

colnames(NAME_G2)<- "ENH4"
#NAME_G2$GROUP <- gsub("CORR1_01d_", "", NAME_G2$ENH4)
NAME_G2$GROUP <- gsub("*_mask_500_FIND", "", NAME_G2$ENH4)
### this is important to check the SYMBOL
NAME_G22<- data.frame(c("00A_Ia","01Ia_A","02U_Ea","03U_Ia","04U_Pa"))


NAME_G2c<- as.character(t(NAME_G2[,2]))

#WORK11A11="../01DATA_ORI/07aMO_forMOAC/08TSS_Enh_w_01dpci_fHeatmap_ALL.xlsx"
WORK11A11="../01DATA_ORI/07aMO_forSeurat_ANALY/08TSS_Enh_w_01dpci_fHeatmap_ALL.xlsx"

MAT_1A= read_excel(WORK11A11)
colnames(MAT_1A)[1]<- "SYMBOL"

dim(MAT_1A)
## [1] 326  11
##### Set resolution for CLustering
RES=0.5

Set processors

## sequential:
## - args: function (..., envir = parent.frame())
## - tweaked: FALSE
## - call: NULL
## sequential:
## - args: function (..., workers = 16, envir = parent.frame())
## - tweaked: TRUE
## - call: plan("sequential", workers = 16)

Load motifs to make the name in the matrix

gather the matrix from all the TFs found in this stage

MERGE MOTIFS vs SIG_MO from ALL MOTIFS

## 
## 00A_Ia_01dcpi 01Ia_A_01dcpi 02U_Ea_01dcpi 03U_Ia_01dcpi 04U_Pa_01dcpi 
##        278786           117          6814         66147         12513
## [1] 3

AS SINGLE CELL PREPARE MATRIX

Create a Seurat object

b7<- AC_RE_VOL_SC_META
rownames(b7)<- b7$PositionID2

mat_c1<- mat_cm_TMM_BOTH_SC[,rownames(b7)]
rownames(mat_c1)<- mat_cm_TMM_BOTH_SC$SYMBOL
dim(mat_c1)
## [1]  207 3090
dim(b7)
## [1] 3090    5
smartseq2 <- CreateSeuratObject(mat_c1,project = "01d", meta.data = b7)

Identification of highly variable features

## png 
##   2

PLot from TOP TFs bound in the data

arrangeQC_TF <- ggarrange(EC00,EC001, ncol = 2,nrow =1,common.legend = T, align = c("hv"),legend="none")

print(arrangeQC_TF)

grid.arrange(  plot1,plot2, ncol = 1,nrow = 2)

Scaling the data

## Centering and scaling data matrix

Perform linear dimensional reduction

## PC_ 1 
## Positive:  THRb, Hoxd12, Tbx5, Smad3, PR, Hoxd11, Eomes, Sox3, Olig2, Sox21 
##     Hoxa9, Hoxa11, Sox10, Hoxd10, Twist2, Smad2, GATA3, NeuroG2, Erra, TRPS1 
##     Nanog, CEBP, ERG, Smad4, EAR2, ETS1, Pdx1, BHLHA15, CDX4, PBX2 
## Negative:  Isl1, bHLHE41, Npas4, Gfi1b, Tbet, Tbx21, Otx2, CHR, Hoxa10, ZNF7 
##     Elk1, HNF6, Cux2, Unknown, PRDM10, SCL, IRF3, Oct6, Max, ELF1 
##     MYNN, HINFP, Tbox, CRE, STAT1, GATA, E2F3, TCFL2, JunD, NRF 
## PC_ 2 
## Positive:  MNT, BMAL1, NPAS2, RUNX2, RUNX, TEAD, NPAS, MITF, Twist2, CLOCK 
##     RUNX1, CEBP, TEAD3, THRb, HIC1, Zic, PBX2, Atf3, Eomes, Hoxc9 
##     Sox3, TEAD2, Olig2, USF1, Usf2, Erra, CDX4, BHLHA15, SF1, Fos 
## Negative:  ETV4, Fli1, ETV1, GABPA, Elf4, Etv2, ERG, ELF3, EWS, EHF 
##     ETS1, ELF5, SPDEF, Elk4, Elk1, ETS, ELF1, SpiB, SCL, Npas4 
##     Meis1, bHLHE41, Isl1, FoxL2, Foxo1, Tbet, IRF8, Tbx21, GSC, Tgif1 
## PC_ 3 
## Positive:  Isl1, Nanog, Tbet, Tbx21, Pitx1, Bapx1, Tbr1, Sox2, RARa, Etv2 
##     TRPS1, Hoxa10, Bcl6, Otx2, Gata2, Sox6, Gata1, EWS, Gfi1b, CHR 
##     ZNF7, Hoxa9, Gata4, Sox15, ELF3, ERG, Gata6, SCL, GATA3, ETS1 
## Negative:  Foxo3, FoxL2, Foxf1, MYB, FOXK1, AMYB, Npas4, Foxo1, HNF6, FOXP1 
##     Hoxd13, Foxa3, FOXA1, Cux2, Unknown, FOXK2, Elk4, ELF1, Cdx2, Elk1 
##     Foxa2, Oct6, BMYB, ETS, CRE, MYNN, CDX4, Meis1, Hoxa11, FOXM1 
## PC_ 4 
## Positive:  BATF, Atf3, Fos, Fra1, JunB, Fra2, Fosl2, ETV1, ETV4, HIC1 
##     GABPA, Elf4, MYB, ETS1, Etv2, Max, ERG, Fli1, Elk4, RUNX2 
##     OCT, Pdx1, Hoxc9, Smad4, ETS, ELF3, ELF1, E2F3, PBX2, EHF 
## Negative:  FOXM1, FOXA1, Foxf1, FOXP1, Pitx1, Fox, FoxL2, FOXK2, SCL, Foxa2 
##     FOXK1, Foxo3, Isl1, GSC, Foxo1, Tbet, Tbx21, FoxD3, Otx2, Foxa3 
##     Nanog, Gata2, Bcl6, ZNF7, Gata1, TRPS1, Hoxa10, Hoxa13, Tbr1, CHR 
## PC_ 5 
## Positive:  Hoxd13, CDX4, Hoxd11, Sox3, Cdx2, Hoxd12, MYB, Sox17, Hoxa11, ZNF711 
##     Smad3, Sox7, AMYB, Hoxd10, Elk4, Elf4, PAX5, HIF2a, Tbx5, Sox21 
##     Smad2, Smad4, Olig2, Sox10, Erra, NFIL3, THRb, RXR, PGR, Unknown 
## Negative:  Fra1, Fra2, Atf3, JunB, BATF, Fos, Fosl2, Fox, FOXP1, Foxa2 
##     FOXA1, FOXM1, Foxf1, Foxo3, FoxL2, FOXK2, FOXK1, Foxa3, Isl1, Pitx1 
##     Tbet, SCL, Tbx21, FoxD3, Otx2, GSC, STAT4, ZNF7, Bcl6, Foxo1

Plot PCA

PLot Pincipal component analysis (PCA)

grid.arrange(  P1, ncol = 1,nrow = 1)

Plot heatmap from 200 Peaks

PLot Pincipal component analysis CA from TOP TFs bound in the data

print(P2)

PCA and checking of the PC which is similar to say how many cluster of cells are in the data

## png 
##   2

PLot to identify how many cluster in the Data

#grid.arrange(  P3, ncol = 1,nrow = 1)

Perform Unsupervised Clusters

set.seed(1)
smartseq2 <- FindNeighbors(smartseq2, dims = 1:10)
smartseq2 <- FindClusters(smartseq2, resolution = RES, random.seed= 1, algorithm=1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 3090
## Number of edges: 94891
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8747
## Number of communities: 11
## Elapsed time: 0 seconds

#########here

Plot UMAP to find Neighborg to use for the DATA

## png 
##   2
## png 
##   2
Plot UMAP and TSNE PLots
print(arrange1t)

print(arrange1)

print(arrange1bt)

print(arrange1c)

print(arrange1cat)

## png 
##   2

Plot UMAP and TSNE PLots by GROUP (Divergent areas)

print(arrange3)

print(arrange3t)

print(arrange2)

Find markers for every cluster 0% in clusters and 0.20 fc

Plot Barplot, number of peaks per cluster and Proteins per Divergent areas

grid.arrange(  P20,P201, ncol = 1,nrow = 2)

Heatmap of the top 10 markers after the Binomil TEST

Plot Barplot, number of peaks per cluster and Proteins per Divergent areas

grid.arrange(  P05ah_bino1, ncol = 1,nrow = 1)

grid.arrange(  P05ah_bino, ncol = 1,nrow = 1)

save output after UMAP

Extract analized matrix from Seurat objects

DATA_SC<- data.frame(smartseq2@assays[["RNA"]]@counts@i)
DATA_SC<- data.frame(smartseq2@assays[["RNA"]]@counts@i)
DATA_SC<- GetAssay(smartseq2,assay = "RNA")
DATA_SC1<- data.frame(DATA_SC@scale.data)
DATA_SC2<- data.frame(DATA_SC@scale.data)
#dim(DATA_SC1)
META_SEU<- data.frame(smartseq2@meta.data)
colnames(DATA_SC1)<- META_SEU$PositionID
colnames(META_SEU)[4]<- "SYMBOL_TF"

Selection of candidate genes from 01dpci marker list. This is performed after checking the output excel file

GENES001<- c("Twist2","Smad4", "Nanog","Smad2","Fos","Sox9","Sox10","JunB")
GENES002 <- c("Pitx1","Isl1",  "Mef2b","FOXA1","TEAD3","GATA3", "Gata6", "Mef2b")
GENES003 <- c("Eomes","Oct4","Foxh1","CUX1","Bapx1", "Elk4","Foxo1","Fli1")
GENES004<- c("Rbp7","Rgcc", "Fabp4","Egfl7","Flt1", "Cd36","Nrp1","Tpm1", "Rbp1")

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[8]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[8]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## png 
##   2
## png 
##   2

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[8]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## 
## [[6]]
## 
## [[7]]
## 
## [[8]]
## 
## [[1]]
## 
## [[2]]
## 
## [[3]]
## 
## [[4]]
## 
## [[5]]
## png 
##   2
## png 
##   2

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

print(arrange12)

Plot several plot from genes in each cluster

## png 
##   2
## png 
##   2

UMAP from the selected genes

print(P05a)

print(P05b)

print(P05c)

TSNE from the selected genes

print(P05a1)

print(P05b1)

print(P05c1)

VIolin plot by Clusters found in the script

VIolin plot splited by seurat cluster and grouped by GROUP (00A_Ia, 01Ia_A, 02U_Ea, 03U_Ia, 04U_Pa)

#grid.arrange(  P04a1v, ncol = 1,nrow = 1)
#grid.arrange(  P04b1v, ncol = 1,nrow = 1)
#grid.arrange(  P04c1v, ncol = 1,nrow = 1)

Violin PLot of selected Markers

## png 
##   2
## png 
##   2
print(arrange1CM41)

print(arrange1CM42)

print(arrange1CM43)